library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(palmerpenguins)
## Scatterplot of bill depth against bill length, with both axes starting at 0.
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  lims(x = c(0, 60), y = c(0, 25))
## Warning: Removed 2 rows containing missing values (geom_point).
Section 5.1: Billboard
## install.packages("billboard")
library(billboard)
head(wiki_hot_100s)
## no title artist year
## 1 1 Theme from A Summer Place Percy Faith 1960
## 2 2 He'll Have to Go Jim Reeves 1960
## 3 3 Cathy's Clown The Everly Brothers 1960
## 4 4 Running Bear Johnny Preston 1960
## 5 5 Teen Angel Mark Dinning 1960
## 6 6 I'm Sorry Brenda Lee 1960
tail(wiki_hot_100s)
## no title artist year
## 5696 95 Adventure of a Lifetime Coldplay 2016
## 5697 96 Humble and Kind Tim McGraw 2016
## 5698 97 Wicked Future 2016
## 5699 98 Tiimmy Turner Desiigner 2016
## 5700 99 See You Again Wiz Khalifa featuring Charlie Puth 2016
## 5701 100 Perfect One Direction 2016
max(wiki_hot_100s$year)
## [1] "2016"
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(tidyverse)
## Ten artists with the most Hot 100 entries in the 2000s (2000 through 2009).
## `year` is stored as text in wiki_hot_100s (max() above returns "2016"), so
## it is converted before being compared with numbers; comparing the text
## directly only works because four-digit strings happen to sort like numbers.
top10 <- wiki_hot_100s %>%
  mutate(year = as.integer(year)) %>%
  filter(year >= 2000, year <= 2009) %>%
  group_by(artist) %>%
  summarise(nsongs = n()) %>%
  arrange(desc(nsongs)) %>%
  slice_head(n = 10) %>%
  ## order the artist factor by hit count so the plot is sorted by nsongs
  mutate(artist = fct_reorder(artist, nsongs))
## Horizontal bar chart: one bar per artist, bar length = number of songs.
ggplot(top10, aes(x = artist, y = nsongs)) +
  geom_col() +
  coord_flip()
5.1.1 Exercises
Exercise 2. There is a minor flaw in the way that we counted up the number of hits for each artist. Examine the 2nd to last row of the original data set with tail() to look at this potential flaw. What do you find?
tail(wiki_hot_100s)
## no title artist year
## 5696 95 Adventure of a Lifetime Coldplay 2016
## 5697 96 Humble and Kind Tim McGraw 2016
## 5698 97 Wicked Future 2016
## 5699 98 Tiimmy Turner Desiigner 2016
## 5700 99 See You Again Wiz Khalifa featuring Charlie Puth 2016
## 5701 100 Perfect One Direction 2016
### The second-to-last row lists two artists on a single song ("Wiz Khalifa featuring Charlie Puth"). Our count groups by the full artist text, so this song is credited to the combined name rather than to either artist individually, and each of them ends up one hit short in the totals.
Exercise 4. Change the plot from Exercise 1 to be a Lollipop chart using this website as a reference. Why might the lollipop chart be better than a bar plot?
## Lollipop chart: a point at each artist's song count, joined to zero by a
## line segment (x is inherited from the plot-level aesthetics).
ggplot(top10, aes(x = artist, y = nsongs)) +
  geom_point() +
  geom_segment(aes(xend = artist, y = 0, yend = nsongs)) +
  coord_flip()
A lollipop chart might be better than a bar plot because the point marks each artist's total number of songs directly, so the values may be easier to read and to compare between artists.
Exercise 5. Use this website to customize the end points of your lollipop chart. If you have time, you can explore the other customization options. Make it look fancy!
## Styled lollipop chart. The stick is added before the point so that the
## translucent, filled head is layered on top of the line instead of being
## crossed by it.
ggplot(data = top10, aes(x = artist, y = nsongs)) +
  geom_segment(aes(xend = artist, y = 0, yend = nsongs)) +
  geom_point(size = 2, color = "red", fill = alpha("red", 0.3),
             alpha = 0.7, shape = 21, stroke = 2) +
  coord_flip()
## the year of the chart to scrape; it is used for both the URL and the
## `year` column added below
year <- 2017
## provide the URL and name it something (in this case, webpage).
## paste0 pastes together the base URL and the year into a single string:
## this will be useful in a moment
webpage <- paste0("https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_", year)
## download the page and convert the html code into something R can read;
## stop_for_status() raises an error if the request did not succeed
## NOTE(review): ssl_verifypeer = FALSE turns off TLS certificate checking;
## it is kept as written, but confirm it is really needed.
content <- webpage %>%
  httr::GET(config = httr::config(ssl_verifypeer = FALSE)) %>%
  httr::stop_for_status() %>%
  read_html()
## grabs the tables
tab <- content %>% html_nodes("table")
## take the first table and tag every row with the year being scraped
## (!! inserts the value of `year` rather than looking for a column)
df <- tab[[1]] %>%
  html_table() %>%
  mutate(year = !!year)
df
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2017
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2017
## 3 3 "\"That's What I Like\"" Bruno Mars 2017
## 4 4 "\"Humble\"" Kendrick Lamar 2017
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2017
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2017
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2017
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2017
## 9 9 "\"Believer\"" Imagine Dragons 2017
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2017
## # … with 90 more rows
## Scrape the Billboard Year-End Hot 100 table for one year from Wikipedia.
##
## year: a single, non-missing year (e.g. 2017) used to build the page URL.
##
## Returns the first table on the page, converted with html_table(), with a
## `year` column added. Stops with an error if `year` is not a single
## non-missing value, if the HTTP request fails, or if the page has no table.
get_wiki_100 <- function(year) {
  stopifnot(length(year) == 1, !is.na(year))

  webpage <- paste0("https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_", year)

  ## NOTE(review): ssl_verifypeer = FALSE turns off TLS certificate checking;
  ## it is kept as written, but confirm it is really needed.
  content <- webpage %>%
    httr::GET(config = httr::config(ssl_verifypeer = FALSE)) %>%
    httr::stop_for_status() %>% ## fail loudly instead of parsing an error page
    read_html()

  tab <- content %>% html_nodes("table")
  if (length(tab) == 0) {
    stop("No table found at ", webpage, call. = FALSE)
  }

  ## !! inserts the value of the `year` argument, so a column that happens to
  ## be called `year` in the scraped table cannot be picked up by mistake
  df <- tab[[1]] %>%
    html_table() %>%
    mutate(year = !!year)

  ## the last expression is the value returned by the function
  df
}
get_wiki_100(year = 2017)
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2017
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2017
## 3 3 "\"That's What I Like\"" Bruno Mars 2017
## 4 4 "\"Humble\"" Kendrick Lamar 2017
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2017
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2017
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2017
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2017
## 9 9 "\"Believer\"" Imagine Dragons 2017
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2017
## # … with 90 more rows
library(purrr)
## the years to scrape, stored as a list with one numeric element per year
year_list <- as.list(as.numeric(2017:2021))
year_list
## [[1]]
## [1] 2017
##
## [[2]]
## [1] 2018
##
## [[3]]
## [1] 2019
##
## [[4]]
## [1] 2020
##
## [[5]]
## [1] 2021
## apply get_wiki_100() to every element of year_list; the result is a list
## of data frames, one for each year
df_all <- purrr::map(.x = year_list, .f = get_wiki_100)
df_all
## [[1]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2017
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2017
## 3 3 "\"That's What I Like\"" Bruno Mars 2017
## 4 4 "\"Humble\"" Kendrick Lamar 2017
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2017
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2017
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2017
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2017
## 9 9 "\"Believer\"" Imagine Dragons 2017
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2017
## # … with 90 more rows
##
## [[2]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"God's Plan\"" Drake 2018
## 2 2 "\"Perfect\"" Ed Sheeran 2018
## 3 3 "\"Meant to Be\"" Bebe Rexha featuring Florida Georgia Line 2018
## 4 4 "\"Havana\"" Camila Cabello featuring Young Thug 2018
## 5 5 "\"Rockstar\"" Post Malone featuring 21 Savage 2018
## 6 6 "\"Psycho\"" Post Malone featuring Ty Dolla Sign 2018
## 7 7 "\"I Like It\"" Cardi B, Bad Bunny and J Balvin 2018
## 8 8 "\"The Middle\"" Zedd, Maren Morris and Grey 2018
## 9 9 "\"In My Feelings\"" Drake 2018
## 10 10 "\"Girls Like You\"" Maroon 5 featuring Cardi B 2018
## # … with 90 more rows
##
## [[3]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Old Town Road\"" Lil Nas X featuring Billy Ray Cyrus 2019
## 2 2 "\"Sunflower\"" Post Malone and Swae Lee 2019
## 3 3 "\"Without Me\"" Halsey 2019
## 4 4 "\"Bad Guy\"" Billie Eilish 2019
## 5 5 "\"Wow\"" Post Malone 2019
## 6 6 "\"Happier\"" Marshmello and Bastille 2019
## 7 7 "\"7 Rings\"" Ariana Grande 2019
## 8 8 "\"Talk\"" Khalid 2019
## 9 9 "\"Sicko Mode\"" Travis Scott 2019
## 10 10 "\"Sucker\"" Jonas Brothers 2019
## # … with 90 more rows
##
## [[4]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Blinding Lights\"" The Weeknd 2020
## 2 2 "\"Circles\"" Post Malone 2020
## 3 3 "\"The Box\"" Roddy Ricch 2020
## 4 4 "\"Don't Start Now\"" Dua Lipa 2020
## 5 5 "\"Rockstar\"" DaBaby featuring Roddy Ricch 2020
## 6 6 "\"Adore You\"" Harry Styles 2020
## 7 7 "\"Life Is Good\"" Future featuring Drake 2020
## 8 8 "\"Memories\"" Maroon 5 2020
## 9 9 "\"The Bones\"" Maren Morris 2020
## 10 10 "\"Someone You Loved\"" Lewis Capaldi 2020
## # … with 90 more rows
##
## [[5]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Levitating\"" Dua Lipa 2021
## 2 2 "\"Save Your Tears\"" The Weeknd and Ariana Grande 2021
## 3 3 "\"Blinding Lights\"" The Weeknd 2021
## 4 4 "\"Mood\"" 24kGoldn featuring Iann Dior 2021
## 5 5 "\"Good 4 U\"" Olivia Rodrigo 2021
## 6 6 "\"Kiss Me More\"" Doja Cat featuring SZA 2021
## 7 7 "\"Leave the Door Open\"" Silk Sonic (Bruno Mars and … 2021
## 8 8 "\"Drivers License\"" Olivia Rodrigo 2021
## 9 9 "\"Montero (Call Me by Your Name)\"" Lil Nas X 2021
## 10 10 "\"Peaches\"" Justin Bieber featuring Dan… 2021
## # … with 90 more rows
## stack the yearly tables into one data frame, rename the columns to match
## the billboard package, and strip the literal double quotes from each title
df_2017_present <- df_all %>%
  bind_rows() %>%
  rename(no = No., title = Title, artist = `Artist(s)`) %>%
  mutate(title = str_remove_all(title, pattern = "\""))
## convert the billboard data to a tibble and change the variable types to
## match the scraped data
wiki_tibble <- wiki_hot_100s %>%
  as_tibble() %>%
  mutate(year = as.numeric(year), no = as.integer(no))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## put the package data and the newly scraped years into a single data frame
hot100_df <- bind_rows(wiki_tibble, df_2017_present)
Exercise 6. Use the hot100_df to make either a bar plot or a lollipop chart of the most popular artists of the 2010s (2010 through 2019). It may be helpful to make this plot without looking back at the code for the 2000s plot until you get stuck.
## Fifteen artists with the most Hot 100 entries from 2010 through 2019.
## count() tallies the rows per artist and sort = TRUE puts the largest
## counts first.
top15_df <- hot100_df %>%
  filter(year >= 2010, year <= 2019) %>%
  count(artist, name = "nsongs", sort = TRUE) %>%
  slice_head(n = 15) %>%
  mutate(nsongs_ordered = fct_reorder(artist, nsongs))
## lollipop chart of those counts, with the artists ordered by number of songs
ggplot(top15_df, aes(x = nsongs_ordered, y = nsongs)) +
  geom_point() +
  geom_segment(aes(xend = nsongs_ordered, y = 0, yend = nsongs)) +
  coord_flip() +
  labs(x = "artist", y = "number of songs")
Exercise 7. Much of the code to scrape the data, using purrr to iterate over the scrape, and then combining the list of data frames to a single data frame may be new. It is not expected that you are able to write this code on your own, but you should have an overall understanding of what the code is doing. Write 2-3 sentences that summarize the overall purpose of the rvest and purrr code.
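The rvest code downloads the HTML of a web page and turns it into something that R can read, then grabs the table from the page and returns it as a data frame that is easier to work with in R. The purrr code (map()) applies that scraping function to each year in a list, producing a list of data frames that bind_rows() then combines into a single data frame.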
5.2 More tidyverse Review: Happy Planet Index
library(tidyverse)
hpi_df <- read_csv("data/hpi-tidy.csv")
## Rows: 151 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country, GovernanceRank, Region
## dbl (8): HPIRank, LifeExpectancy, Wellbeing, HappyLifeYears, Footprint, Happ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hpi_df
## # A tibble: 151 × 11
## HPIRank Country LifeExpectancy Wellbeing HappyLifeYears Footprint
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 109 Afghanistan 48.7 4.76 29.0 0.540
## 2 18 Albania 76.9 5.27 48.8 1.81
## 3 26 Algeria 73.1 5.24 46.2 1.65
## 4 127 Angola 51.1 4.21 28.2 0.891
## 5 17 Argentina 75.9 6.44 55.0 2.71
## 6 53 Armenia 74.2 4.37 41.9 1.73
## 7 76 Australia 81.9 7.41 65.5 6.68
## 8 48 Austria 80.9 7.35 64.3 5.29
## 9 80 Azerbaijan 70.7 4.22 39.1 1.97
## 10 146 Bahrain 75.1 4.55 43.5 6.65
## # … with 141 more rows, and 5 more variables: HappyPlanetIndex <dbl>,
## # Population <dbl>, GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
5.2.1 Making a Scatterplot and Labeling Points
## scatterplot of Wellbeing against Footprint for every row of hpi_df
ggplot(hpi_df, aes(x = Footprint, y = Wellbeing)) +
  geom_point()
## keep only the United States row so that it can be labelled on its own
hpi_us <- filter(hpi_df, Country == "United States of America")
hpi_us
## # A tibble: 1 × 11
## HPIRank Country LifeExpectancy Wellbeing HappyLifeYears Footprint
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 105 United States of Am… 78.5 7.16 61.3 7.19
## # … with 5 more variables: HappyPlanetIndex <dbl>, Population <dbl>,
## # GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
## same scatterplot with a text label added; data = hpi_us is specified so
## that geom_label only uses the observation in hpi_us
ggplot(hpi_df, aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label(aes(label = Country), data = hpi_us)
library(ggrepel)
## labelled scatterplot using geom_label_repel() for the label
ggplot(hpi_df, aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = hpi_us) +
  ## a second point that is an open circle (shape = 1) with a larger size
  ## (size = 3), drawn to surround the United States point
  geom_point(data = hpi_us, shape = 1, size = 3)
Exercise 1. Change the code to label 3 countries of interest. Recall that you will need to use the | operator in the dplyr::filter() function.
## the three countries to label; %in% keeps the same rows as chaining
## Country == "..." comparisons together with the | operator
label3 <- hpi_df %>%
  filter(Country %in% c("New Zealand", "Australia", "Kenya"))
## label and circle those three countries on the scatterplot
ggplot(hpi_df, aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = label3) +
  geom_point(data = label3, shape = 1, size = 3)
5.2.2 plotly to Label Points Interactively
## install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## build the scatterplot as a ggplot object, then hand it to ggplotly()
plot1 <- ggplot(hpi_df, aes(x = Footprint, y = Wellbeing)) +
  geom_point()
ggplotly(plot1)
## rebuild the plot with Country mapped to the label aesthetic and pass
## tooltip = "label" to ggplotly()
plot1 <- ggplot(hpi_df, aes(x = Footprint, y = Wellbeing, label = Country)) +
  geom_point()
ggplotly(plot1, tooltip = "label")
## labelled scatterplot with explanatory text added through labs(): axis
## labels, a title, a subtitle (smaller text size than the title), and a
## caption at the bottom of the figure
ggplot(hpi_df, aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3) +
  labs(
    x = "Ecological Footprint",
    y = "Wellbeing",
    title = "Countries with a Higher Ecological Footprint Tend to Have Citizens with Higher Wellbeing",
    subtitle = "Wellbeing is on a 1-10 scale",
    caption = "Data Source: http://happyplanetindex.org/countries"
  )
## Region mapped to colour, using the "Accent" brewer palette
ggplot(hpi_df, aes(x = Footprint, y = HappyLifeYears, colour = Region)) +
  geom_point() +
  scale_colour_brewer(palette = "Accent")
## same plot, using the discrete viridis scale with the "plasma" option
ggplot(hpi_df, aes(x = Footprint, y = HappyLifeYears, colour = Region)) +
  geom_point() +
  scale_colour_viridis_d(option = "plasma")
## one panel per Region instead of one colour per Region
ggplot(hpi_df, aes(x = Footprint, y = HappyLifeYears)) +
  geom_point() +
  facet_wrap(vars(Region))
library(palmerpenguins)
## species mapped to colour: colour is good enough here
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(mapping = aes(colour = species))
## Warning: Removed 2 rows containing missing values (geom_point).
## one panel per species: faceting is probably unnecessary, colour is better
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  facet_wrap(vars(species))
## Warning: Removed 2 rows containing missing values (geom_point).
## simulated data: 500 draws from a normal(0, 1) for each of x and y, with
## the rows labelled in ten consecutive groups ("A" through "J") of 50
colour_bad <- tibble(
  x = rnorm(500, mean = 0, sd = 1),
  y = rnorm(500, mean = 0, sd = 1),
  groupvar = rep(LETTERS[1:10], each = 50)
)
## ten groups mapped to colour: can't distinguish anything really, so
## colour is bad here
ggplot(colour_bad, aes(x = x, y = y, colour = groupvar)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## one panel per group instead: faceting is better
ggplot(colour_bad, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(vars(groupvar))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'